In [1]:
import pandas as pd
import os 
In [2]:
os.getcwd()
Out[2]:
'C:\\Users\\belwa\\Documents\\Practice'
In [3]:
os.chdir("C:\\Users\\belwa\\OneDrive\\Documents\\0Files\\Learning\\Acmegrade\\Jul 23 DS Day 16\\Projects\\Detection of Parkinsons Disease\\")
In [4]:
df = pd.read_csv("parkinsons.data")
df
Out[4]:
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 phon_R01_S01_1 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 ... 0.06545 0.02211 21.033 1 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 phon_R01_S01_2 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 ... 0.09403 0.01929 19.085 1 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 phon_R01_S01_3 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 ... 0.08270 0.01309 20.651 1 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 phon_R01_S01_4 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 ... 0.08771 0.01353 20.644 1 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 phon_R01_S01_5 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 ... 0.10470 0.01767 19.649 1 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
190 phon_R01_S50_2 174.188 230.978 94.261 0.00459 0.00003 0.00263 0.00259 0.00790 0.04087 ... 0.07008 0.02764 19.517 0 0.448439 0.657899 -6.538586 0.121952 2.657476 0.133050
191 phon_R01_S50_3 209.516 253.017 89.488 0.00564 0.00003 0.00331 0.00292 0.00994 0.02751 ... 0.04812 0.01810 19.147 0 0.431674 0.683244 -6.195325 0.129303 2.784312 0.168895
192 phon_R01_S50_4 174.688 240.005 74.287 0.01360 0.00008 0.00624 0.00564 0.01873 0.02308 ... 0.03804 0.10715 17.883 0 0.407567 0.655683 -6.787197 0.158453 2.679772 0.131728
193 phon_R01_S50_5 198.764 396.961 74.904 0.00740 0.00004 0.00370 0.00390 0.01109 0.02296 ... 0.03794 0.07223 19.020 0 0.451221 0.643956 -6.744577 0.207454 2.138608 0.123306
194 phon_R01_S50_6 214.289 260.277 77.973 0.00567 0.00003 0.00295 0.00317 0.00885 0.01884 ... 0.03078 0.04398 21.209 0 0.462803 0.664357 -5.724056 0.190667 2.555477 0.148569

195 rows × 24 columns

Understanding Data¶

In [5]:
df.head() #head returns top 5 rows of data
Out[5]:
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 phon_R01_S01_1 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 ... 0.06545 0.02211 21.033 1 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 phon_R01_S01_2 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 ... 0.09403 0.01929 19.085 1 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 phon_R01_S01_3 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 ... 0.08270 0.01309 20.651 1 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 phon_R01_S01_4 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 ... 0.08771 0.01353 20.644 1 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 phon_R01_S01_5 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 ... 0.10470 0.01767 19.649 1 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335

5 rows × 24 columns

In [6]:
df.tail() #tail returns bottom 5 rows of data
Out[6]:
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
190 phon_R01_S50_2 174.188 230.978 94.261 0.00459 0.00003 0.00263 0.00259 0.00790 0.04087 ... 0.07008 0.02764 19.517 0 0.448439 0.657899 -6.538586 0.121952 2.657476 0.133050
191 phon_R01_S50_3 209.516 253.017 89.488 0.00564 0.00003 0.00331 0.00292 0.00994 0.02751 ... 0.04812 0.01810 19.147 0 0.431674 0.683244 -6.195325 0.129303 2.784312 0.168895
192 phon_R01_S50_4 174.688 240.005 74.287 0.01360 0.00008 0.00624 0.00564 0.01873 0.02308 ... 0.03804 0.10715 17.883 0 0.407567 0.655683 -6.787197 0.158453 2.679772 0.131728
193 phon_R01_S50_5 198.764 396.961 74.904 0.00740 0.00004 0.00370 0.00390 0.01109 0.02296 ... 0.03794 0.07223 19.020 0 0.451221 0.643956 -6.744577 0.207454 2.138608 0.123306
194 phon_R01_S50_6 214.289 260.277 77.973 0.00567 0.00003 0.00295 0.00317 0.00885 0.01884 ... 0.03078 0.04398 21.209 0 0.462803 0.664357 -5.724056 0.190667 2.555477 0.148569

5 rows × 24 columns

In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 17  status            195 non-null    int64  
 18  RPDE              195 non-null    float64
 19  DFA               195 non-null    float64
 20  spread1           195 non-null    float64
 21  spread2           195 non-null    float64
 22  D2                195 non-null    float64
 23  PPE               195 non-null    float64
dtypes: float64(22), int64(1), object(1)
memory usage: 36.7+ KB

WHAT IS PARKINSON'S DISEASE?¶

Parkinson's disease is a neurodegenerative disorder that primarily affects movement. It occurs when nerve cells (neurons) in the brain that produce dopamine, a chemical messenger responsible for smooth and coordinated muscle movements, become impaired or die. This leads to symptoms such as tremors, stiffness, slowness of movement, and difficulties with balance and coordination. Parkinson's disease is chronic and progressive, meaning it worsens over time, but treatments are available to help manage its symptoms.

WHAT IS MDVP?¶

  • MDVP stands for Multidimensional Voice Programme
  • MDVP:Fo(Hz) - average rate at which vocal cords vibrate, measured in hertz
  • MDVP: Fhi(Hz) - highest rate at which vocal cords vibrate,
  • MDVP: Flo(Hz) - lowerst rate at which vocal cords vibrate,
  • MDVP: Jitter(%) -indicates degree of irregularity in the vocal cord vibrations, which can affect the smoothness of speech.
  • MDVP: Jitter(Abs) -absolute difference between consecutive periods of the fundamental frequency, measured in microseconds.
  • MDVP: RAP - "Relative Average Perturbation" calculated based on the average difference between consecutive periods of the fundamental frequency, expressed as a percentage.
  • MDVP: PPQ - "Five-Point Period Perturbation Quotient" calculated based on the average absolute difference between consecutive periods of the fundamental frequency, expressed as a percentage.
  • Jitter: DDP - represents the average difference between jitter cycles, calculated in microseconds.
  • MDVP:Shimmer - measure of variation in the amplitude of the voice signal, which indicates the degree of irregularity or flutter in the vocal cord vibrations.
  • MDVP:Shimmer(dB) - This is the amplitude variation measured in decibels (dB), providing a more standardized measure of shimmer.
  • Shimmer:APQ3 - "Amplitude Perturbation Quotient" and is calculated based on the average absolute difference between consecutive amplitude peaks over a specified time interval (e.g., 3 milliseconds).
  • Shimmer:APQ5 - Similar to Shimmer:APQ3, but calculated over a longer time interval (e.g., 5 milliseconds), providing a measure of medium-term amplitude variation.
  • MDVP:APQ - Amplitude Perturbation Quotient" and is a measure of the average absolute difference between the amplitudes of consecutive periods of the voice signal.
  • Shimmer:DDA - represents the average difference between consecutive shimmer cycles, calculated in milliseconds.
  • NHR -
  • NHR (Noise-to-Harmonics Ratio): This is a measure of the ratio of noise to tonal components in the voice. It indicates the amount of noise present in the voice signal relative to the harmonic (tonal) components.

  • HNR (Harmonics-to-Noise Ratio): This is the inverse of NHR and represents the ratio of harmonic (tonal) components to noise in the voice. Higher values indicate a clearer, more harmonic-rich voice signal.

  • RPDE (Recurrence Period Density Entropy): This is a measure of the nonlinear dynamical complexity of the voice signal. It quantifies the rate of recurrence of patterns in the signal, reflecting its predictability and complexity.

  • D2 (Correlation dimension): This is another measure of the nonlinear dynamical complexity of the voice signal. It quantifies the number of independent degrees of freedom in the signal, providing insights into its underlying dynamics.

  • DFA (Detrended Fluctuation Analysis): This is a method used to analyze the fractal properties of the voice signal. DFA calculates the fractal scaling exponent, which describes how the fluctuation of the signal changes with the length of the observation window. It provides information about the long-range correlation properties of the signal.

In [ ]:
 
In [8]:
df.shape
Out[8]:
(195, 24)
In [9]:
len(df)
Out[9]:
195
In [10]:
df.dtypes
Out[10]:
name                 object
MDVP:Fo(Hz)         float64
MDVP:Fhi(Hz)        float64
MDVP:Flo(Hz)        float64
MDVP:Jitter(%)      float64
MDVP:Jitter(Abs)    float64
MDVP:RAP            float64
MDVP:PPQ            float64
Jitter:DDP          float64
MDVP:Shimmer        float64
MDVP:Shimmer(dB)    float64
Shimmer:APQ3        float64
Shimmer:APQ5        float64
MDVP:APQ            float64
Shimmer:DDA         float64
NHR                 float64
HNR                 float64
status                int64
RPDE                float64
DFA                 float64
spread1             float64
spread2             float64
D2                  float64
PPE                 float64
dtype: object
In [11]:
df.describe().to_excel("Summary.xlsx")
In [12]:
# status - health status of the subject (one) - Parkinson's, (zero) – healthy
In [13]:
import matplotlib.pyplot as plt
In [14]:
df.status.hist()
plt.xlabel("status")
plt.ylabel("No. of Patients")
plt.plot()
plt.show()
In [15]:
import pandas as pd
from ydata_profiling import ProfileReport
In [16]:
ProfileReport(df)
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[16]:

In [17]:
plt.figure(figsize=(10,6))
df.status.hist()
plt.xlabel("Status")
plt.ylabel("Frequencies")
Out[17]:
Text(0, 0.5, 'Frequencies')
In [18]:
plt.figure(figsize=(10,6))
plt.bar(df.status, df.NHR)
plt.xlabel("STATUS")
plt.ylabel("NHR")
Out[18]:
Text(0, 0.5, 'NHR')
In [19]:
import seaborn as sns
plt.figure(figsize=(10,6))
sns.barplot(x = "status", y = "NHR", data = df)
plt.show()
In [20]:
import seaborn as sns
plt.figure(figsize=(10,6))
sns.barplot(x = "status", y = "HNR", data =df)
plt.show()
In [21]:
import seaborn as sns
plt.figure(figsize = (10,6))
sns.barplot(x = "status" , y = "RPDE", data = df)
plt.show()
In [22]:
import warnings
warnings.filterwarnings('ignore')
rows = 3
columns = 7
fig, ax = plt.subplots(nrows = rows , ncols = columns, figsize = (16,7))
col = df.columns
index = 1
for i in range(rows):
    for j in range(columns):
        sns.distplot(df[col[index]], ax = ax[i,j])
        index = index + 1
        
plt.tight_layout()
plt.show()
In [23]:
corr = df.iloc[:,1:].corr()
In [24]:
plt.figure(figsize = (20,10))
sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns, annot = True)
plt.show()
In [25]:
plt.figure(figsize = (20,10))
sns.heatmap(corr)
plt.show()
In [26]:
#droping the name column
#df = df.iloc[:,1:]
df.drop(['name'], axis = 1, inplace = True)
df
Out[26]:
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 0.426 ... 0.06545 0.02211 21.033 1 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 0.626 ... 0.09403 0.01929 19.085 1 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 0.482 ... 0.08270 0.01309 20.651 1 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 0.517 ... 0.08771 0.01353 20.644 1 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 0.584 ... 0.10470 0.01767 19.649 1 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
190 174.188 230.978 94.261 0.00459 0.00003 0.00263 0.00259 0.00790 0.04087 0.405 ... 0.07008 0.02764 19.517 0 0.448439 0.657899 -6.538586 0.121952 2.657476 0.133050
191 209.516 253.017 89.488 0.00564 0.00003 0.00331 0.00292 0.00994 0.02751 0.263 ... 0.04812 0.01810 19.147 0 0.431674 0.683244 -6.195325 0.129303 2.784312 0.168895
192 174.688 240.005 74.287 0.01360 0.00008 0.00624 0.00564 0.01873 0.02308 0.256 ... 0.03804 0.10715 17.883 0 0.407567 0.655683 -6.787197 0.158453 2.679772 0.131728
193 198.764 396.961 74.904 0.00740 0.00004 0.00370 0.00390 0.01109 0.02296 0.241 ... 0.03794 0.07223 19.020 0 0.451221 0.643956 -6.744577 0.207454 2.138608 0.123306
194 214.289 260.277 77.973 0.00567 0.00003 0.00295 0.00317 0.00885 0.01884 0.190 ... 0.03078 0.04398 21.209 0 0.462803 0.664357 -5.724056 0.190667 2.555477 0.148569

195 rows × 23 columns

In [27]:
#spliting data into x and y 
x = df.drop(labels=['status'], axis = 1)
display(x.head())
y = df['status']
display(y.head())
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) ... MDVP:APQ Shimmer:DDA NHR HNR RPDE DFA spread1 spread2 D2 PPE
0 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 0.426 ... 0.02971 0.06545 0.02211 21.033 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 0.626 ... 0.04368 0.09403 0.01929 19.085 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 0.482 ... 0.03590 0.08270 0.01309 20.651 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 0.517 ... 0.03772 0.08771 0.01353 20.644 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 0.584 ... 0.04465 0.10470 0.01767 19.649 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335

5 rows × 22 columns

0    1
1    1
2    1
3    1
4    1
Name: status, dtype: int64
In [28]:
#spliting data into xtrain, ytrain and xtest, ytest
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state = 21)
In [29]:
print(x.shape, y.shape)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
(195, 22) (195,)
(156, 22) (156,) (39, 22) (39,)
In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
In [31]:
#Create a Logistic regression Model
from sklearn.linear_model import LogisticRegression as logReg
LR = logReg().fit(x_train, y_train)
#predict on train
LR_pred_train = LR.predict(x_train)
#predict on test
LR_pred = LR.predict(x_test)
print("Model accuracy on test is: " , accuracy_score(y_test, LR_pred))
print('-'*50)
#CONFUSION MATRIX
print("confusion_matrix train is \n")
display( confusion_matrix(y_train, LR_pred_train))
print("confusion_matrix test is \n", confusion_matrix(y_test, LR_pred))
print("\n Classification Report Train is")
print(classification_report(y_train, LR_pred_train))
print("\n Classification Report Test is")
print(classification_report(y_test, LR_pred))
Model accuracy on test is:  0.8974358974358975
--------------------------------------------------
confusion_matrix train is 

array([[ 21,  18],
       [  6, 111]], dtype=int64)
confusion_matrix test is 
 [[ 6  3]
 [ 1 29]]

 Classification Report Train is
              precision    recall  f1-score   support

           0       0.78      0.54      0.64        39
           1       0.86      0.95      0.90       117

    accuracy                           0.85       156
   macro avg       0.82      0.74      0.77       156
weighted avg       0.84      0.85      0.84       156


 Classification Report Test is
              precision    recall  f1-score   support

           0       0.86      0.67      0.75         9
           1       0.91      0.97      0.94        30

    accuracy                           0.90        39
   macro avg       0.88      0.82      0.84        39
weighted avg       0.89      0.90      0.89        39

In [32]:
y_test.shape
Out[32]:
(39,)
In [33]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier().fit(x_train, y_train)
RF_pred_train = RF.predict(x_train)
RF_pred_test = RF.predict(x_test)
print('Modal accuracy on train is: ', accuracy_score(y_train, RF_pred_train))
print('Modal accuracy on test is: ', accuracy_score(y_test, RF_pred_test))
print('confusion matrix on train \n', confusion_matrix(y_train, RF_pred_train))
print('confusion matrix on test \n', confusion_matrix(y_test, RF_pred_test))
print('Classification matrix on train is \n', classification_report(y_train, RF_pred_train))
print('Classifiaction matrix on test is \n', classification_report(y_test, RF_pred_test))
Modal accuracy on train is:  1.0
Modal accuracy on test is:  0.8717948717948718
confusion matrix on train 
 [[ 39   0]
 [  0 117]]
confusion matrix on test 
 [[ 6  3]
 [ 2 28]]
Classification matrix on train is 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       1.00      1.00      1.00       117

    accuracy                           1.00       156
   macro avg       1.00      1.00      1.00       156
weighted avg       1.00      1.00      1.00       156

Classifiaction matrix on test is 
               precision    recall  f1-score   support

           0       0.75      0.67      0.71         9
           1       0.90      0.93      0.92        30

    accuracy                           0.87        39
   macro avg       0.83      0.80      0.81        39
weighted avg       0.87      0.87      0.87        39

In [34]:
print((y_test !=RF_pred_test).sum(),'/',((y_test == RF_pred_test).sum()+(y_test != RF_pred_test).sum()))
5 / 39
In [35]:
from sklearn import metrics
print('KappaScore is: ', metrics.cohen_kappa_score(y_test,RF_pred_test))
KappaScore is:  0.6242774566473989
In [36]:
#Decision TREE
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier().fit(x_train, y_train)

#predict on train
DT_pred_train = DT.predict(x_train)

#predict on test
DT_pred_test = DT.predict(x_test)

#accuracy on train and test
print('accuracy on test: ', accuracy_score(y_test, DT_pred_test))
print('accuracy on train: ', accuracy_score(y_train, DT_pred_train))
print('-' * 50)

#Confusion Matrix
print('Confusion Matrix on test: \n', confusion_matrix(y_test, DT_pred_test))
print('Confusion Matrix on train: \n', confusion_matrix(y_train, DT_pred_train))
print('-' * 50)

#Classification_Report
print('Classification Report on test: \n', classification_report(y_test, DT_pred_test))
print('Classification Report on train: \n', classification_report(y_train, DT_pred_train))
print('-'*50)
accuracy on test:  0.8974358974358975
accuracy on train:  1.0
--------------------------------------------------
Confusion Matrix on test: 
 [[ 8  1]
 [ 3 27]]
Confusion Matrix on train: 
 [[ 39   0]
 [  0 117]]
--------------------------------------------------
Classification Report on test: 
               precision    recall  f1-score   support

           0       0.73      0.89      0.80         9
           1       0.96      0.90      0.93        30

    accuracy                           0.90        39
   macro avg       0.85      0.89      0.87        39
weighted avg       0.91      0.90      0.90        39

Classification Report on train: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       1.00      1.00      1.00       117

    accuracy                           1.00       156
   macro avg       1.00      1.00      1.00       156
weighted avg       1.00      1.00      1.00       156

--------------------------------------------------
In [37]:
print((y_test!= DT_pred_test).sum(),'/',((y_test==DT_pred_test).sum()+(y_test != DT_pred_test).sum()))
print('-'*50)
print('Kappa Score is:', metrics.cohen_kappa_score(y_test, DT_pred_test))
4 / 39
--------------------------------------------------
Kappa Score is: 0.731958762886598
In [38]:
#Naive bayes
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB().fit(x_train, y_train)

#predict on train
NB_pred_train = NB.predict(x_train)

#predict on test
NB_pred_test = NB.predict(x_test)

#accuracy on train and test
print('accuracy on test: ', accuracy_score(y_test, NB_pred_test))
print('accuracy on train: ', accuracy_score(y_train, NB_pred_train))
print('-' * 50)

#Confusion Matrix
print('Confusion Matrix on test: \n', confusion_matrix(y_test, NB_pred_test))
print('Confusion Matrix on train: \n', confusion_matrix(y_train, NB_pred_train))
print('-' * 50)

#Classification_Report
print('Classification Report on test: \n', classification_report(y_test, NB_pred_test))
print('Classification Report on train: \n', classification_report(y_train, NB_pred_train))
print('-'*50)
accuracy on test:  0.7435897435897436
accuracy on train:  0.7115384615384616
--------------------------------------------------
Confusion Matrix on test: 
 [[ 8  1]
 [ 9 21]]
Confusion Matrix on train: 
 [[37  2]
 [43 74]]
--------------------------------------------------
Classification Report on test: 
               precision    recall  f1-score   support

           0       0.47      0.89      0.62         9
           1       0.95      0.70      0.81        30

    accuracy                           0.74        39
   macro avg       0.71      0.79      0.71        39
weighted avg       0.84      0.74      0.76        39

Classification Report on train: 
               precision    recall  f1-score   support

           0       0.46      0.95      0.62        39
           1       0.97      0.63      0.77       117

    accuracy                           0.71       156
   macro avg       0.72      0.79      0.69       156
weighted avg       0.85      0.71      0.73       156

--------------------------------------------------
In [39]:
print((y_test!= NB_pred_test).sum(),'/',((y_test==NB_pred_test).sum()+(y_test != NB_pred_test).sum()))
print('-'*50)
print('Kappa Score is:', metrics.cohen_kappa_score(y_test, NB_pred_test))
10 / 39
--------------------------------------------------
Kappa Score is: 0.4491525423728814
In [40]:
# K Neighbour Classifier
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier().fit(x_train, y_train)

#predict on train
KNN_pred_train = KNN.predict(x_train.values)

#predict on test
KNN_pred_test = KNN.predict(x_test.values)

#accuracy on train and test
print('accuracy on test: ', accuracy_score(y_test, KNN_pred_test))
print('accuracy on train: ', accuracy_score(y_train, KNN_pred_train))
print('-' * 50)

#Confusion Matrix
print('Confusion Matrix on test: \n', confusion_matrix(y_test, KNN_pred_test))
print('Confusion Matrix on train: \n', confusion_matrix(y_train, KNN_pred_train))
print('-' * 50)

#Classification_Report
print('Classification Report on test: \n', classification_report(y_test, KNN_pred_test))
print('Classification Report on train: \n', classification_report(y_train, KNN_pred_train))
print('-'*50)
accuracy on test:  0.8205128205128205
accuracy on train:  0.8910256410256411
--------------------------------------------------
Confusion Matrix on test: 
 [[ 4  5]
 [ 2 28]]
Confusion Matrix on train: 
 [[ 28  11]
 [  6 111]]
--------------------------------------------------
Classification Report on test: 
               precision    recall  f1-score   support

           0       0.67      0.44      0.53         9
           1       0.85      0.93      0.89        30

    accuracy                           0.82        39
   macro avg       0.76      0.69      0.71        39
weighted avg       0.81      0.82      0.81        39

Classification Report on train: 
               precision    recall  f1-score   support

           0       0.82      0.72      0.77        39
           1       0.91      0.95      0.93       117

    accuracy                           0.89       156
   macro avg       0.87      0.83      0.85       156
weighted avg       0.89      0.89      0.89       156

--------------------------------------------------
In [41]:
print((y_test!= KNN_pred_test).sum(),'/',((y_test==KNN_pred_test).sum()+(y_test != KNN_pred_test).sum()))
print('-'*50)
print('Kappa Score is:', metrics.cohen_kappa_score(y_test, KNN_pred_test))
7 / 39
--------------------------------------------------
Kappa Score is: 0.4276729559748428
In [42]:
#SVM
from sklearn.svm import SVC
SVM = SVC(kernel = 'linear')
SVM.fit(x_train, y_train)

#predict on train
SVM_pred_train = SVM.predict(x_train)

#predict on test
SVM_pred_test = SVM.predict(x_test)

#accuracy on train and test
print('accuracy on test: ', accuracy_score(y_test, SVM_pred_test))
print('accuracy on train: ', accuracy_score(y_train, SVM_pred_train))
print('-' * 50)

#Confusion Matrix
print('Confusion Matrix on test: \n', confusion_matrix(y_test, SVM_pred_test))
print('Confusion Matrix on train: \n', confusion_matrix(y_train, SVM_pred_train))
print('-' * 50)

#Classification_Report
print('Classification Report on test: \n', classification_report(y_test, SVM_pred_test))
print('Classification Report on train: \n', classification_report(y_train, SVM_pred_train))
print('-'*50)
accuracy on test:  0.8974358974358975
accuracy on train:  0.8717948717948718
--------------------------------------------------
Confusion Matrix on test: 
 [[ 5  4]
 [ 0 30]]
Confusion Matrix on train: 
 [[ 21  18]
 [  2 115]]
--------------------------------------------------
Classification Report on test: 
               precision    recall  f1-score   support

           0       1.00      0.56      0.71         9
           1       0.88      1.00      0.94        30

    accuracy                           0.90        39
   macro avg       0.94      0.78      0.83        39
weighted avg       0.91      0.90      0.89        39

Classification Report on train: 
               precision    recall  f1-score   support

           0       0.91      0.54      0.68        39
           1       0.86      0.98      0.92       117

    accuracy                           0.87       156
   macro avg       0.89      0.76      0.80       156
weighted avg       0.88      0.87      0.86       156

--------------------------------------------------
In [43]:
print((y_test!= SVM_pred_test).sum(),'/',((y_test==SVM_pred_test).sum()+(y_test != SVM_pred_test).sum()))
print('-'*50)
print('Kappa Score is:', metrics.cohen_kappa_score(y_test, SVM_pred_test))
4 / 39
--------------------------------------------------
Kappa Score is: 0.6578947368421053
In [44]:
import pickle 
# Saving model to disk
pickle.dump(SVM,open('deploy_SVM.pkl','wb'))
# Open the Pickle File 
model=pickle.load(open('deploy_SVM.pkl','rb'))
# Prediction 
print (model.predict (x_train))
[1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1
 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1]
In [ ]: